ID: the unique identification code for every customer
Year_Birth: The Year of a customer’s birth
Education: The level of education that a customer completed
Marital_Status: Status of Marriage
Income: Annual Income
Kidhome: # of children under the age of 13 in Customer’s household
Teenhome: # of children between 13-19 in Customer’s household
Dt_Customer: Date of Customer Enrollment
Recency: # of days since last purchase
MntWines: Dollar amount of Wines purchased in last 2 years
MntFruits: Dollar amount of Fruits purchased in last 2 years
MntMeatProducts: Dollar amount of Meat products purchased in the last 2 years
MntFishProducts: Dollar amount of Fish products purchased in the last 2 years
MntSweetProducts: Dollar amount of Sweet products purchased in the last 2 years
MntGoldProds: Dollar amount of Gold products purchased in the last 2 years
NumDealsPurchases: # of purchases made with discount
NumWebPurchases: # of purchases made through the company’s website
NumCatalogPurchases: # of purchases made using the catalog
NumStorePurchases: # of purchases made directly in-store
NumWebVisitsMonth: # of visits made through company’s website
AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise
AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise
AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise
Complain: 1 if customer complained in the last 2 years, 0 otherwise
Response: 1 if customer accepted the offer in the last campaign, 0 otherwise
options(warn = -1)
Sys.setenv(LANGUAGE = "en")
library(tidyverse)
library(plyr) #count()
library(GGally) #ggcorr() and ggpairs()
library(reshape) #melt()
library(corrplot) #corrplot
library(dplyr)
library(vcd)
#second library import
library(randomForest)
library(class)
library(caret)
library(ranger)
library(rsample)
library(e1071)
library(cluster)
library(factoextra)
#import
marketing=read.table(file = "marketing_campaign.csv", fill = TRUE, header = TRUE)
head(marketing)
## ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524 1957 Graduation Single 58138 0 0 04-09-2012
## 2 2174 1954 Graduation Single 46344 1 1 08-03-2014
## 3 4141 1965 Graduation Together 71613 0 0 21-08-2013
## 4 6182 1984 Graduation Together 26646 1 0 10-02-2014
## 5 5324 1981 PhD Married 58293 1 0 19-01-2014
## 6 7446 1967 Master Together 62513 0 1 09-09-2013
## Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1 58 635 88 546 172 88
## 2 38 11 1 6 2 1
## 3 26 426 49 127 111 21
## 4 26 11 4 20 10 3
## 5 94 173 43 118 46 27
## 6 16 520 42 98 0 42
## MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1 88 3 8 10
## 2 6 2 1 1
## 3 42 1 8 2
## 4 5 2 2 0
## 5 15 5 5 3
## 6 14 2 6 4
## NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1 4 7 0 0 0
## 2 2 5 0 0 0
## 3 10 4 0 0 0
## 4 4 6 0 0 0
## 5 6 5 0 0 0
## 6 10 6 0 0 0
## AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1 0 0 0 3 11 1
## 2 0 0 0 3 11 0
## 3 0 0 0 3 11 0
## 4 0 0 0 3 11 0
## 5 0 0 0 3 11 0
## 6 0 0 0 3 11 0
str(marketing)
## 'data.frame': 2440 obs. of 29 variables:
## $ ID : int 5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
## $ Year_Birth : int 1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
## $ Education : chr "Graduation" "Graduation" "Graduation" "Graduation" ...
## $ Marital_Status : chr "Single" "Single" "Together" "Together" ...
## $ Income : chr "58138" "46344" "71613" "26646" ...
## $ Kidhome : int 0 1 0 1 1 0 0 1 1 1 ...
## $ Teenhome : chr "0" "1" "0" "0" ...
## $ Dt_Customer : chr "04-09-2012" "08-03-2014" "21-08-2013" "10-02-2014" ...
## $ Recency : chr "58" "38" "26" "26" ...
## $ MntWines : int 635 11 426 11 173 520 235 76 14 28 ...
## $ MntFruits : int 88 1 49 4 43 42 65 10 0 0 ...
## $ MntMeatProducts : int 546 6 127 20 118 98 164 56 24 6 ...
## $ MntFishProducts : int 172 2 111 10 46 0 50 3 3 1 ...
## $ MntSweetProducts : int 88 1 21 3 27 42 49 1 3 1 ...
## $ MntGoldProds : int 88 6 42 5 15 14 27 23 2 13 ...
## $ NumDealsPurchases : int 3 2 1 2 5 2 4 2 1 1 ...
## $ NumWebPurchases : int 8 1 8 2 5 6 7 4 3 1 ...
## $ NumCatalogPurchases: int 10 1 2 0 3 4 3 0 0 0 ...
## $ NumStorePurchases : int 4 2 10 4 6 10 7 4 2 0 ...
## $ NumWebVisitsMonth : int 7 5 4 6 5 6 6 8 9 20 ...
## $ AcceptedCmp3 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ AcceptedCmp4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Complain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Z_CostContact : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Z_Revenue : int 11 11 11 11 11 11 11 11 11 11 ...
## $ Response : int 1 0 0 0 0 0 0 0 1 0 ...
nrow(marketing)
## [1] 2440
str(marketing)
## 'data.frame': 2440 obs. of 29 variables:
## $ ID : int 5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
## $ Year_Birth : int 1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
## $ Education : chr "Graduation" "Graduation" "Graduation" "Graduation" ...
## $ Marital_Status : chr "Single" "Single" "Together" "Together" ...
## $ Income : chr "58138" "46344" "71613" "26646" ...
## $ Kidhome : int 0 1 0 1 1 0 0 1 1 1 ...
## $ Teenhome : chr "0" "1" "0" "0" ...
## $ Dt_Customer : chr "04-09-2012" "08-03-2014" "21-08-2013" "10-02-2014" ...
## $ Recency : chr "58" "38" "26" "26" ...
## $ MntWines : int 635 11 426 11 173 520 235 76 14 28 ...
## $ MntFruits : int 88 1 49 4 43 42 65 10 0 0 ...
## $ MntMeatProducts : int 546 6 127 20 118 98 164 56 24 6 ...
## $ MntFishProducts : int 172 2 111 10 46 0 50 3 3 1 ...
## $ MntSweetProducts : int 88 1 21 3 27 42 49 1 3 1 ...
## $ MntGoldProds : int 88 6 42 5 15 14 27 23 2 13 ...
## $ NumDealsPurchases : int 3 2 1 2 5 2 4 2 1 1 ...
## $ NumWebPurchases : int 8 1 8 2 5 6 7 4 3 1 ...
## $ NumCatalogPurchases: int 10 1 2 0 3 4 3 0 0 0 ...
## $ NumStorePurchases : int 4 2 10 4 6 10 7 4 2 0 ...
## $ NumWebVisitsMonth : int 7 5 4 6 5 6 6 8 9 20 ...
## $ AcceptedCmp3 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ AcceptedCmp4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Complain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Z_CostContact : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Z_Revenue : int 11 11 11 11 11 11 11 11 11 11 ...
## $ Response : int 1 0 0 0 0 0 0 0 1 0 ...
#columns that will be made numeric
col.names<- names(marketing)
num.cols <- col.names[-c(1,3,4,8)]
marketing <- marketing %>%
mutate_at(num.cols, as.numeric) %>%
mutate(Dt_Customer = as.Date(Dt_Customer, format = "%d-%m-%Y")) #create Date column
head(marketing)
## ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524 1957 Graduation Single 58138 0 0 2012-09-04
## 2 2174 1954 Graduation Single 46344 1 1 2014-03-08
## 3 4141 1965 Graduation Together 71613 0 0 2013-08-21
## 4 6182 1984 Graduation Together 26646 1 0 2014-02-10
## 5 5324 1981 PhD Married 58293 1 0 2014-01-19
## 6 7446 1967 Master Together 62513 0 1 2013-09-09
## Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1 58 635 88 546 172 88
## 2 38 11 1 6 2 1
## 3 26 426 49 127 111 21
## 4 26 11 4 20 10 3
## 5 94 173 43 118 46 27
## 6 16 520 42 98 0 42
## MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1 88 3 8 10
## 2 6 2 1 1
## 3 42 1 8 2
## 4 5 2 2 0
## 5 15 5 5 3
## 6 14 2 6 4
## NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1 4 7 0 0 0
## 2 2 5 0 0 0
## 3 10 4 0 0 0
## 4 4 6 0 0 0
## 5 6 5 0 0 0
## 6 10 6 0 0 0
## AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1 0 0 0 3 11 1
## 2 0 0 0 3 11 0
## 3 0 0 0 3 11 0
## 4 0 0 0 3 11 0
## 5 0 0 0 3 11 0
## 6 0 0 0 3 11 0
marketing = na.omit(marketing)
nrow(marketing )
## [1] 2016
str(marketing)
## 'data.frame': 2016 obs. of 29 variables:
## $ ID : int 5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
## $ Year_Birth : num 1957 1954 1965 1984 1981 ...
## $ Education : chr "Graduation" "Graduation" "Graduation" "Graduation" ...
## $ Marital_Status : chr "Single" "Single" "Together" "Together" ...
## $ Income : num 58138 46344 71613 26646 58293 ...
## $ Kidhome : num 0 1 0 1 1 0 0 1 1 1 ...
## $ Teenhome : num 0 1 0 0 0 1 1 0 0 1 ...
## $ Dt_Customer : Date, format: "2012-09-04" "2014-03-08" ...
## $ Recency : num 58 38 26 26 94 16 34 32 19 68 ...
## $ MntWines : num 635 11 426 11 173 520 235 76 14 28 ...
## $ MntFruits : num 88 1 49 4 43 42 65 10 0 0 ...
## $ MntMeatProducts : num 546 6 127 20 118 98 164 56 24 6 ...
## $ MntFishProducts : num 172 2 111 10 46 0 50 3 3 1 ...
## $ MntSweetProducts : num 88 1 21 3 27 42 49 1 3 1 ...
## $ MntGoldProds : num 88 6 42 5 15 14 27 23 2 13 ...
## $ NumDealsPurchases : num 3 2 1 2 5 2 4 2 1 1 ...
## $ NumWebPurchases : num 8 1 8 2 5 6 7 4 3 1 ...
## $ NumCatalogPurchases: num 10 1 2 0 3 4 3 0 0 0 ...
## $ NumStorePurchases : num 4 2 10 4 6 10 7 4 2 0 ...
## $ NumWebVisitsMonth : num 7 5 4 6 5 6 6 8 9 20 ...
## $ AcceptedCmp3 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ AcceptedCmp4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Complain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Z_CostContact : num 3 3 3 3 3 3 3 3 3 3 ...
## $ Z_Revenue : num 11 11 11 11 11 11 11 11 11 11 ...
## $ Response : num 1 0 0 0 0 0 0 0 1 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:424] 11 20 21 29 39 40 46 49 50 52 ...
## ..- attr(*, "names")= chr [1:424] "11" "20" "21" "29" ...
count(marketing$Marital_Status)
## x freq
## 1 Absurd 2
## 2 Alone 3
## 3 Divorced 209
## 4 Married 777
## 5 Single 435
## 6 Together 517
## 7 Widow 71
## 8 YOLO 2
Marital_Status currently has 8 different levels, some of them more populated than others and many being repetitive. While, some of the levels can be combined (‘Alone’ and ‘Single’ most likely describe the same experience) others cannot. We don’t know what an Absurd or YOLO Marital Status is and therefore we’ll have to handle these values differently. Thankfully, the unexpected values are few in between and most Customers can be categorized as Coupled (either Married or Together) and a similarly large proportion are Single (whether that be Alone, Single, Divorced, or Widowed).
#Create New Cohesive Categories
marketing$Rel_Status[marketing$Marital_Status %in% c('Alone', 'Divorced', 'Widow', 'Single')] <- 'Single'
marketing$Rel_Status[marketing$Marital_Status %in% c('Married', 'Together')] <- 'Coupled'
marketing$Rel_Status[marketing$Marital_Status %in% c('Absurd', 'YOLO')] <- '' #insert blanks to be handled later
# Drop rows where the value in 'column' is equal to 'condition'
marketing <- subset(marketing,Rel_Status != '')
nrow(marketing)
## [1] 2012
count(marketing$Rel_Status)
## x freq
## 1 Coupled 1294
## 2 Single 718
count(marketing$Education)
## x freq
## 1 Basic 54
## 2 Graduation 1115
## 3 Master 364
## 4 PhD 479
ggplot(marketing, aes(x = Income)) +
geom_boxplot()
outliers <- boxplot(marketing$Income, plot = FALSE)$out
marketing <- marketing %>%
filter(Income < max(outliers) - 1)
#look at unknown variables
summary(marketing$Z_CostContact)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 3 3 3 3 3
ggplot(marketing, aes(x = Z_CostContact)) +
geom_boxplot()
summary(marketing$Z_Revenue)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11 11 11 11 11 11
ggplot(marketing, aes(x = Z_Revenue)) +
geom_boxplot()
#Date signed up and year of birth
ggplot(marketing, aes(Dt_Customer)) +
geom_density(color = "darkblue", fill = "lightblue") +
geom_vline(aes(xintercept = mean(Dt_Customer)), color = 'red', linetype = 'dashed', linewidth = 1)
ggplot(marketing, aes(Year_Birth)) +
geom_density(color = "darkblue", fill = "lightblue") +
geom_vline(aes(xintercept = mean(Year_Birth)), color = 'red', linetype = 'dashed', linewidth = 1)
A look at variables that refer to a date will give us more context to all of the customers we have.
The red dashed line in these plots represents the average of all customers. The average customer then joined around July of 2013 then and was borned near 1970.
There is little variation in when Customer’s enrolled with our company, but the data seems to be bound to customers and their data between July of 2012 and July of 2014.
Additionally, our company seems to be most populated by the Baby Boomer and X Generations, taking a decline when it comes to Millenials, and have no information on GenZers, though that could be accounted for by the fact that this data ended collection in 2014 when many GenZers were too young to make enrollment
marketing <- marketing %>%
#creating new variables based off old ones
mutate(MntSpent = MntFishProducts + MntMeatProducts + MntFruits + MntSweetProducts + MntWines + MntGoldProds) %>%
mutate(NumPurchases = NumCatalogPurchases + NumStorePurchases + NumWebPurchases) %>%
mutate(MinorsHome = Kidhome + Teenhome) %>%
mutate(AcceptedPrv = AcceptedCmp1 + AcceptedCmp2 + AcceptedCmp3 + AcceptedCmp4 + AcceptedCmp5) %>%
mutate(Age = as.numeric(2023 - Year_Birth)) # Age is the age that registered to be the member, not the current age
#marketing <- marketing[order(marketing$column), ]
# Remove using subset
marketing <- marketing[, -c(1,2,4,6,7,27,28)]
new_order = sort(colnames(marketing))
marketing <- marketing[, new_order]
marketing = na.omit(marketing)
head(marketing)
## AcceptedCmp1 AcceptedCmp2 AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedPrv
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## Age Complain Dt_Customer Education Income MinorsHome MntFishProducts
## 1 66 0 2012-09-04 Graduation 58138 0 172
## 2 69 0 2014-03-08 Graduation 46344 2 2
## 3 58 0 2013-08-21 Graduation 71613 0 111
## 4 39 0 2014-02-10 Graduation 26646 1 10
## 5 42 0 2014-01-19 PhD 58293 1 46
## 6 56 0 2013-09-09 Master 62513 1 0
## MntFruits MntGoldProds MntMeatProducts MntSpent MntSweetProducts MntWines
## 1 88 88 546 1617 88 635
## 2 1 6 6 27 1 11
## 3 49 42 127 776 21 426
## 4 4 5 20 53 3 11
## 5 43 15 118 422 27 173
## 6 42 14 98 716 42 520
## NumCatalogPurchases NumDealsPurchases NumPurchases NumStorePurchases
## 1 10 3 22 4
## 2 1 2 4 2
## 3 2 1 20 10
## 4 0 2 6 4
## 5 3 5 14 6
## 6 4 2 20 10
## NumWebPurchases NumWebVisitsMonth Recency Rel_Status Response
## 1 8 7 58 Single 1
## 2 1 5 38 Single 0
## 3 8 4 26 Coupled 0
## 4 2 6 26 Coupled 0
## 5 5 5 94 Coupled 0
## 6 6 6 16 Coupled 0
#comprehensive boxplots
unwant.cols <- c('AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain',
'Education', 'Rel_Status', 'Dt_Customer', 'Response', 'AcceptedPrv')
melt.marketing <- marketing %>%
select(-one_of(unwant.cols)) %>%
melt()
## Using as id variables
ggplot(melt.marketing, aes(factor(variable), value)) +
geom_boxplot(color = 'steelblue') +
facet_wrap(~variable, scale = 'free') +
labs(title = 'Boxplots of Various Variables', x = 'Variables', y = 'Ranges')
#remove outliers from age variable
outliers <- boxplot(marketing$Age, plot = FALSE)$out
marketing <- marketing %>%
filter(Age < min(outliers))
#list of products
products <- c('MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds')
#sum amounts spent on products and set these values in df
products.df <- marketing %>%
select(products) %>% summarize_each(sum) %>%
t() %>% as.data.frame() %>%
rownames_to_column('Products')
#clean up structures
colnames(products.df) <- c('Products', 'Sums')
products <- gsub('Products', '', gsub(c('Mnt'), '', products))
#creating pie chart
ggplot(products.df, aes(x = '', y = Sums, fill = Products)) +
geom_bar(stat = 'identity', width = 1, color = 'black') +
geom_text(aes(label = paste('$', Sums)), color = 'white', position = position_stack(vjust = 0.5)) +
coord_polar('y', start = 0) +
labs(title = 'Percentage of Total Sales from Products', fill = 'Products',
caption = paste('Total Revenue: $', sum(products.df$Sums))) +
scale_fill_discrete(labels = sort(products)) +
theme(axis.ticks=element_blank(), axis.text.y=element_blank(), axis.text.x=element_text(colour='black'),
axis.title=element_blank()) +
scale_y_continuous(breaks = cumsum(products.df$Sums) - products.df$Sums / 2,
labels = paste(round(products.df$Sums/sum(products.df$Sums) * 100, 1), '%'))
purchase <- c('NumCatalogPurchases', 'NumStorePurchases', 'NumWebPurchases')
purchase.df <- marketing %>%
select(purchase) %>% summarize_each(sum) %>%
t() %>% as.data.frame() %>%
rownames_to_column('Place')
colnames(purchase.df) <- c('Place', 'Sums')
purchase <- gsub('Purchases', '', gsub(c('Num'), '', purchase))
ggplot(purchase.df, aes(x = '', y = Sums, fill = Place)) +
geom_bar(stat = 'identity', width = 1, color = 'black') +
geom_text(aes(label = paste(Sums)), color = 'white', position = position_stack(vjust = 0.5)) +
coord_polar('y', start = 0) +
labs(title = 'Percentage of Total Num of Purchases', fill = 'Places',
caption = paste('Total Num: ', sum(purchase.df$Sums))) +
scale_fill_discrete(labels = sort(purchase)) +
theme(axis.ticks=element_blank(), axis.text.y=element_blank(), axis.text.x=element_text(colour='black'),
axis.title=element_blank()) +
scale_y_continuous(breaks = cumsum(purchase.df$Sums) - purchase.df$Sums / 2,
labels = paste(round(purchase.df$Sums/sum(purchase.df$Sums) * 100, 1), '%'))
#correlation plot between numeric vectors
Correlation_plot <- ggcorr(select(marketing, -one_of(unwant.cols)),
geom = 'blank', label = TRUE, hjust = 1.2,wjust = 1, layout.exp = 3) +
geom_point(size = 10, aes(color = coefficient > 0, alpha = abs(coefficient) > 0.6)) +
scale_alpha_manual(values = c('TRUE' = 0.25, 'FALSE' = 0)) +
guides(color = 'none', alpha = 'none') +
labs(title = 'Correlation Map')
Correlation_plot
Here we have a map of the correlation between our numerical variables. Correlations of 0.6 or above are circled in a light blue color while correlations of -0.6 and below are circled in a light red color. There seem to be far more positively correlated variables than there are negatively correlated variables.
Some positive correlations are due to one variable being a sum of many. For example, obviously MntSpent and NumPurchases will be positively correlated with their components. They should also be expected to be positively correlated with each other as well as the more you spend the more number of purchases we should expect a customer to make.
The most positively correlated data that are not so clear to us however include Income to MntSpent, suggesting that as a customer’s Income increases we can expect them to spend more on our products. MntMeatProducts and NumCatalogPurchases are also correlated together, suggesting that many customers purchase our meat products from the catalogue and not in-store or on our website.
Other variables have no significant correlations with any variable- like Age and Recency.
The only negatively correlated relationship we have is between Income and NumWebVisitsMonth. However, Income and NumWebPurchases are not negatively correlated. This indicates that customers with lower incomes are expected to visit our website more but make a similar number of purchases as their higher income counterparts.
We can see the more interesting correlations in individual scatterplots:
#income v mntspent
ggplot(marketing, aes(x = MntSpent, y = Income)) +
geom_point() +
geom_smooth(method = lm) +
labs(title = 'Income Against Amount Spent', x = 'Amount Spent ($)', y = 'Yearly Income ($)')
## `geom_smooth()` using formula = 'y ~ x'
#income by age
ggplot(marketing, aes(x = NumWebVisitsMonth, y = Income)) +
geom_point() +
geom_smooth(method = lm) +
labs(title = 'Income Against Age', x = '# of Web Visits per Month', y = 'Yearly Income ($)')
## `geom_smooth()` using formula = 'y ~ x'
#pie chart of complaints
complaint.counts <- count(marketing$Complain)
ggplot(complaint.counts, aes(x = '', y = freq, fill = as.character(x))) +
geom_bar(stat = 'identity', width = 1) +
coord_polar('y', start = 0) +
labs(title = 'Share of Complaints', subtitle = 'In the last 2 Years') +
scale_fill_discrete(name = "Complant?", labels = c("No", "Yes")) +
theme_void()
#boxplot Income by accepted previous
ggplot(marketing, aes(x = as.character(AcceptedPrv), y = Income)) +
geom_boxplot(color = 'steelblue') +
labs(x = 'Previously Accepted Campaigns')
#boxplot Income by Response
ggplot(marketing, aes(x = as.character(Response), y = Income)) +
geom_boxplot(color = 'steelblue') +
labs(x = 'Response', y = 'Annual Income')
#boxplot Minors Home by accepted previous
ggplot(marketing, aes(x = as.character(AcceptedPrv), y = MinorsHome)) +
geom_boxplot(color = 'steelblue') +
labs(x = 'Previously Accepted Campaigns', y = 'Kids at Home')
#boxplot amount spent by Minors Home
ggplot(marketing, aes(x = as.character(MinorsHome), y = MntSpent)) +
geom_boxplot(color = 'steelblue') +
labs(x = 'Kids at Home', y = 'Amount Spent')
#boxplot Age by accepted previous
ggplot(marketing, aes(x = as.character(AcceptedPrv), y = Age)) +
geom_boxplot(color = 'steelblue') +
labs(x = 'Previously Accepted Campaigns', y = 'Age')
#boxplot Recency by Response
ggplot(marketing, aes(x = as.character(Response), y = Recency)) +
geom_boxplot(color = 'steelblue') +
labs(x = 'Response', y = 'Recency')
ggplot(marketing, aes(x = as.character(AcceptedPrv), fill = Education)) +
geom_bar(position = 'stack') +
labs(x = 'Previously Accepted Campaigns', fill = 'Education')
chisq <- chisq.test(table(marketing$AcceptedPrv, marketing$Education))
chisq
##
## Pearson's Chi-squared test
##
## data: table(marketing$AcceptedPrv, marketing$Education)
## X-squared = 12.976, df = 12, p-value = 0.3708
round(chisq$residuals, 3)
##
## Basic Graduation Master PhD
## 0 0.808 0.195 0.121 -0.675
## 1 -0.684 -0.352 0.079 0.698
## 2 -1.400 -1.015 -0.060 2.074
## 3 -1.050 0.687 -0.523 -0.240
## 4 -0.544 0.771 -0.703 -0.381
This bar chart is interesting for several reasons. On the x axis we see the number of campaigns a customer has previously accepted. Each bar is then subdivided to show the share each education level represents. Many customers have never accepted a campaign before and the distribution tails off with an exceptionally low number of Customers having accepted 4 previous campaigns and none having accepted all 5 previous.
We can conduct a chi-squared test to find if any education level is predisposed to accepting less or more of our previous campaigns. Unfortunately, no statistically significant difference between education levels and previously accepted campaigns is found, meaning that education levels are represented in equal proportions in any of our bars in this chart.
#Relationship by whether Accepted previous campaign
ggplot(marketing, aes(x = as.character(AcceptedPrv), fill = as.character(Response) )) +
geom_bar(position = 'stack') +
labs(x = 'Previously Accepted Campaigns', fill = 'Response')
chisq <- chisq.test(table(marketing$AcceptedPrv, marketing$Response))
chisq
##
## Pearson's Chi-squared test
##
## data: table(marketing$AcceptedPrv, marketing$Response)
## X-squared = 351.6, df = 4, p-value < 2.2e-16
round(chisq$residuals, 3)
##
## 0 1
## 0 2.929 -6.872
## 1 -3.142 7.373
## 2 -3.025 7.098
## 3 -4.363 10.236
## 4 -2.723 6.390
#bar chart of most successful marketing campaign
cmps <- c('AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response')
#considering making a function out of this from how much I use these lines of code
cmp.df <- marketing %>%
select(cmps) %>% summarize_each(sum) %>%
t() %>% as.data.frame() %>%
rownames_to_column('Campaigns') #two columns, one is name of column and the next is totals
#clean up the structure for easier manipulation
cmp.df <- cmp.df %>%
mutate(Percents = V1 / nrow(marketing)) %>% #create percents
select(-V1) #drop sums
#bar plot
ggplot(cmp.df, aes(y = reorder(Campaigns, Percents), x = Percents)) +
geom_bar(stat = 'identity', fill = 'steelblue') +
labs(x = 'Percentage', y = 'Campaigns')
The last visualization we’ll do here is a bar graph of the percentage of customers who accepted each of our campaigns. As we can see, the second campaign did the least well, failing to engage even 5% of our enrolled customers. We might learn the most by discussing the mistakes in that campaign. Other previous campaigns have had similar engagement to one another, having been accepted by more than 5% of the customer’s enrolled.
Unlike the other campaigns, the current campaign has done wonders better, engaging 15% more than other campaigns, a near 300% increase in engagement. We can say matter-of-factly that our current campaign is the most successful while the second campaign was the least successful.
str(marketing)
## 'data.frame': 2010 obs. of 28 variables:
## $ AcceptedCmp1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp3 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ AcceptedCmp4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedPrv : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Age : num 66 69 58 39 42 56 52 38 49 73 ...
## $ Complain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Dt_Customer : Date, format: "2012-09-04" "2014-03-08" ...
## $ Education : chr "Graduation" "Graduation" "Graduation" "Graduation" ...
## $ Income : num 58138 46344 71613 26646 58293 ...
## $ MinorsHome : num 0 2 0 1 1 1 1 1 1 2 ...
## $ MntFishProducts : num 172 2 111 10 46 0 50 3 3 1 ...
## $ MntFruits : num 88 1 49 4 43 42 65 10 0 0 ...
## $ MntGoldProds : num 88 6 42 5 15 14 27 23 2 13 ...
## $ MntMeatProducts : num 546 6 127 20 118 98 164 56 24 6 ...
## $ MntSpent : num 1617 27 776 53 422 ...
## $ MntSweetProducts : num 88 1 21 3 27 42 49 1 3 1 ...
## $ MntWines : num 635 11 426 11 173 520 235 76 14 28 ...
## $ NumCatalogPurchases: num 10 1 2 0 3 4 3 0 0 0 ...
## $ NumDealsPurchases : num 3 2 1 2 5 2 4 2 1 1 ...
## $ NumPurchases : num 22 4 20 6 14 20 17 8 5 1 ...
## $ NumStorePurchases : num 4 2 10 4 6 10 7 4 2 0 ...
## $ NumWebPurchases : num 8 1 8 2 5 6 7 4 3 1 ...
## $ NumWebVisitsMonth : num 7 5 4 6 5 6 6 8 9 20 ...
## $ Recency : num 58 38 26 26 94 16 34 32 19 68 ...
## $ Rel_Status : chr "Single" "Single" "Coupled" "Coupled" ...
## $ Response : num 1 0 0 0 0 0 0 0 1 0 ...
# Perform one-hot encoding on the 'category' column
encoded_data <- model.matrix(~ Education - 1, data = marketing)
ohe_data <- cbind(marketing, encoded_data)
encoded_data <- model.matrix(~ Rel_Status - 1, data = ohe_data)
ohe_data <- cbind(ohe_data , encoded_data)
head( ohe_data )
## AcceptedCmp1 AcceptedCmp2 AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedPrv
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## Age Complain Dt_Customer Education Income MinorsHome MntFishProducts
## 1 66 0 2012-09-04 Graduation 58138 0 172
## 2 69 0 2014-03-08 Graduation 46344 2 2
## 3 58 0 2013-08-21 Graduation 71613 0 111
## 4 39 0 2014-02-10 Graduation 26646 1 10
## 5 42 0 2014-01-19 PhD 58293 1 46
## 6 56 0 2013-09-09 Master 62513 1 0
## MntFruits MntGoldProds MntMeatProducts MntSpent MntSweetProducts MntWines
## 1 88 88 546 1617 88 635
## 2 1 6 6 27 1 11
## 3 49 42 127 776 21 426
## 4 4 5 20 53 3 11
## 5 43 15 118 422 27 173
## 6 42 14 98 716 42 520
## NumCatalogPurchases NumDealsPurchases NumPurchases NumStorePurchases
## 1 10 3 22 4
## 2 1 2 4 2
## 3 2 1 20 10
## 4 0 2 6 4
## 5 3 5 14 6
## 6 4 2 20 10
## NumWebPurchases NumWebVisitsMonth Recency Rel_Status Response EducationBasic
## 1 8 7 58 Single 1 0
## 2 1 5 38 Single 0 0
## 3 8 4 26 Coupled 0 0
## 4 2 6 26 Coupled 0 0
## 5 5 5 94 Coupled 0 0
## 6 6 6 16 Coupled 0 0
## EducationGraduation EducationMaster EducationPhD Rel_StatusCoupled
## 1 1 0 0 0
## 2 1 0 0 0
## 3 1 0 0 1
## 4 1 0 0 1
## 5 0 0 1 1
## 6 0 1 0 1
## Rel_StatusSingle
## 1 1
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
#ohe_data$Dt_Customer = as.factor(ohe_data$Dt_Customer)
ohe_data = ohe_data[,-c(34,27,10)]
ohe_data$Response = as.factor(ohe_data$Response)
str( ohe_data )
## 'data.frame': 2010 obs. of 31 variables:
## $ AcceptedCmp1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp3 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ AcceptedCmp4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedPrv : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Age : num 66 69 58 39 42 56 52 38 49 73 ...
## $ Complain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Dt_Customer : Date, format: "2012-09-04" "2014-03-08" ...
## $ Income : num 58138 46344 71613 26646 58293 ...
## $ MinorsHome : num 0 2 0 1 1 1 1 1 1 2 ...
## $ MntFishProducts : num 172 2 111 10 46 0 50 3 3 1 ...
## $ MntFruits : num 88 1 49 4 43 42 65 10 0 0 ...
## $ MntGoldProds : num 88 6 42 5 15 14 27 23 2 13 ...
## $ MntMeatProducts : num 546 6 127 20 118 98 164 56 24 6 ...
## $ MntSpent : num 1617 27 776 53 422 ...
## $ MntSweetProducts : num 88 1 21 3 27 42 49 1 3 1 ...
## $ MntWines : num 635 11 426 11 173 520 235 76 14 28 ...
## $ NumCatalogPurchases: num 10 1 2 0 3 4 3 0 0 0 ...
## $ NumDealsPurchases : num 3 2 1 2 5 2 4 2 1 1 ...
## $ NumPurchases : num 22 4 20 6 14 20 17 8 5 1 ...
## $ NumStorePurchases : num 4 2 10 4 6 10 7 4 2 0 ...
## $ NumWebPurchases : num 8 1 8 2 5 6 7 4 3 1 ...
## $ NumWebVisitsMonth : num 7 5 4 6 5 6 6 8 9 20 ...
## $ Recency : num 58 38 26 26 94 16 34 32 19 68 ...
## $ Response : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 2 1 ...
## $ EducationBasic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ EducationGraduation: num 1 1 1 1 0 0 1 0 0 0 ...
## $ EducationMaster : num 0 0 0 0 0 1 0 0 0 0 ...
## $ EducationPhD : num 0 0 0 0 1 0 0 1 1 1 ...
## $ Rel_StatusCoupled : num 0 0 1 1 1 1 0 1 1 1 ...
ohe_data$Campaigns_Accepted = marketing$AcceptedPrv + marketing$Response
# Define the cut points and labels for the categorical variable
cut_points <- c(-1,0.5,10)
labels <- c(0, 1)
# Transform the continuous variable into a categorical variable
ohe_data$Campaigns_Accepted <- cut(ohe_data$Campaigns_Accepted, breaks = cut_points, labels = labels)
str(ohe_data)
## 'data.frame': 2010 obs. of 32 variables:
## $ AcceptedCmp1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp3 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ AcceptedCmp4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedPrv : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Age : num 66 69 58 39 42 56 52 38 49 73 ...
## $ Complain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Dt_Customer : Date, format: "2012-09-04" "2014-03-08" ...
## $ Income : num 58138 46344 71613 26646 58293 ...
## $ MinorsHome : num 0 2 0 1 1 1 1 1 1 2 ...
## $ MntFishProducts : num 172 2 111 10 46 0 50 3 3 1 ...
## $ MntFruits : num 88 1 49 4 43 42 65 10 0 0 ...
## $ MntGoldProds : num 88 6 42 5 15 14 27 23 2 13 ...
## $ MntMeatProducts : num 546 6 127 20 118 98 164 56 24 6 ...
## $ MntSpent : num 1617 27 776 53 422 ...
## $ MntSweetProducts : num 88 1 21 3 27 42 49 1 3 1 ...
## $ MntWines : num 635 11 426 11 173 520 235 76 14 28 ...
## $ NumCatalogPurchases: num 10 1 2 0 3 4 3 0 0 0 ...
## $ NumDealsPurchases : num 3 2 1 2 5 2 4 2 1 1 ...
## $ NumPurchases : num 22 4 20 6 14 20 17 8 5 1 ...
## $ NumStorePurchases : num 4 2 10 4 6 10 7 4 2 0 ...
## $ NumWebPurchases : num 8 1 8 2 5 6 7 4 3 1 ...
## $ NumWebVisitsMonth : num 7 5 4 6 5 6 6 8 9 20 ...
## $ Recency : num 58 38 26 26 94 16 34 32 19 68 ...
## $ Response : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 2 1 ...
## $ EducationBasic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ EducationGraduation: num 1 1 1 1 0 0 1 0 0 0 ...
## $ EducationMaster : num 0 0 0 0 0 1 0 0 0 0 ...
## $ EducationPhD : num 0 0 0 0 1 0 0 1 1 1 ...
## $ Rel_StatusCoupled : num 0 0 1 1 1 1 0 1 1 1 ...
## $ Campaigns_Accepted : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 2 2 ...
we want to divide customers into two groups - engaged and non-engaged.
Set the target attribute to CampaignsAccepted, with
a value of 1 if the customer accepted any offer in previous campaigns,
and 0 otherwise.
# Set the number of folds for cross-validation
k <- 5
# Create empty vectors to store the cross-validation results
accuracy <- numeric(k)
recall <- numeric(k)
precision <- numeric(k)
specificity <- numeric(k)
# Perform k-fold cross-validation
for (i in 1:k) {
# Create training and testing indices for the current fold
test_indices <- ((i - 1) * nrow(ohe_data) / k + 1):(i * nrow(ohe_data) / k)
train_indices <- setdiff(1:nrow(ohe_data), test_indices)
# Subset the data into training and testing sets
train_data <- ohe_data[train_indices, ]
test_data <- ohe_data[test_indices, ]
train_data <- train_data[,-c(1,2,3,4,5,6,9,26)]
test_data <- test_data[,-c(1,2,3,4,5,6,9,26)]
# Train the Random Forest model
model <- randomForest(Campaigns_Accepted ~ ., data = train_data,importance=T, ntree=2000, mtry=4)
varImpPlot(model,main = "Variable Importance Plot",cex = 0.5)
#print(varImp(model,scale=TRUE))
predtreer <- predict(model, test_data)
#Confusion matrix
confusion=table(test_data$Campaigns_Accepted, predtreer);confusion
# Predict the class labels for the testing set
predictions <- predict(model, newdata = test_data)
# Calculate and store the accuracy for the current fold
accuracy[i] <- sum(predictions == test_data$Campaigns_Accepted) / length(predictions)
cm <- confusionMatrix(predictions, test_data$Campaigns_Accepted, positive = '1')
recall[i] <- cm$byClass['Sensitivity']
precision[i] <- cm$byClass['Precision']
specificity[i] <- cm$byClass['Specificity']
}
# Compute the average accuracy across all folds
mean_accuracy <- mean(accuracy)
avg_recall <- mean(recall)
avg_precision <- mean(precision)
avg_specificity <- mean(specificity)
# Print the cross-validation results
print(paste('accuracy:',accuracy))
## [1] "accuracy: 0.823383084577114" "accuracy: 0.818407960199005"
## [3] "accuracy: 0.810945273631841" "accuracy: 0.843283582089552"
## [5] "accuracy: 0.781094527363184"
print(paste('recall:',recall))
## [1] "recall: 0.56" "recall: 0.585106382978723"
## [3] "recall: 0.53030303030303" "recall: 0.564814814814815"
## [5] "recall: 0.430894308943089"
print(paste('precision:',precision))
## [1] "precision: 0.674698795180723" "precision: 0.617977528089888"
## [3] "precision: 0.833333333333333" "precision: 0.792207792207792"
## [5] "precision: 0.746478873239437"
print(paste('specificity:',specificity))
## [1] "specificity: 0.910596026490066" "specificity: 0.88961038961039"
## [3] "specificity: 0.948148148148148" "specificity: 0.945578231292517"
## [5] "specificity: 0.935483870967742"
print(paste('mean accuracy:',mean_accuracy))
## [1] "mean accuracy: 0.815422885572139"
print(paste('mean recall:',avg_recall))
## [1] "mean recall: 0.534223707407932"
print(paste('mean precision:',avg_precision))
## [1] "mean precision: 0.732939264410234"
print(paste('mean specificity:',avg_specificity))
## [1] "mean specificity: 0.925883333301773"
# Perform k-means clustering
k <- 2 # Number of clusters
set.seed(101) # Set a seed for reproducibility
# Specify the column names to extract
# Extract the specified columns from the dataframe
all_data.n <- marketing %>% dplyr::select(where(is.numeric))
#all_data.n$Total_Campaigns_Accepted <- marketing$AcceptedPrv + marketing$Response
#all_data.n$Campaigns_Accepted <- ohe_data$Campaigns_Accepted
all_data.n <- as.data.frame(apply(all_data.n, 2, function(x) (x - min(x)) / (max(x) - min(x))))
kmeans_result_1 = kmeans(all_data.n, centers = k)
# Access the cluster assignments
#cluster_assignments <- kmeans_result_1$cluster
clusplot(all_data.n, kmeans_result_1$cluster, color = T, shade = T, labels = 2, main = "K-means Clustering for K=2 using all numeric variables")
all_data <- marketing
#all_data$Total_Campaigns_Accepted <- marketing$AcceptedPrv + marketing$Response
selected_cols <- c('MntSpent','MntWines','MntMeatProducts','Income','MntGoldProds','NumStorePurchases')
kmeans_data <- all_data.n[, selected_cols]
#kmeans_data$Total_Campaigns_Accepted <- marketing$AcceptedPrv + marketing$Response
#kmeans_data <- as.data.frame(apply(kmeans_data, 2, function(x) (x - min(x)) / (max(x) - min(x))))
kmeans_result_2 = kmeans(kmeans_data, centers = k)
# Access the cluster assignments
cluster_assignments <- kmeans_result_2$cluster
clusplot(kmeans_data, kmeans_result_2$cluster, color = T, shade = T, labels = 2, main = "K-means Clustering for K=2 using variables with high importance score")
# Load required package
# Calculate Silhouette coefficients
silhouette = silhouette(kmeans_result_1$cluster, dist(all_data.n))
# Plot the Silhouette coefficients
# plot(silhouette, main = "Silhouette Plot for k-means Clustering using all numeric variables",col = c("red", "green"))
fviz_silhouette(silhouette)
## cluster size ave.sil.width
## 1 1 1194 0.45
## 2 2 816 0.03
silhouette = silhouette(kmeans_result_2$cluster, dist(kmeans_data))
# Plot the Silhouette coefficients
#plot(silhouette, main = "Silhouette Plot for k-means Clustering using variables with high importance score",col = c("red", "green"))
fviz_silhouette(silhouette)
## cluster size ave.sil.width
## 1 1 1151 0.69
## 2 2 859 0.30
#library(ggplot2)
#library(ggpubr)
# Assuming you have a dataframe named 'data' containing your data, and 'target' is the column containing the class labels
# Install and load the necessary package
# Select the columns for pair plotting
plot_data <- marketing
plot_data$kmeans_cluster <- cluster_assignments
columns_to_plot <- c('MntSpent','MntWines','MntMeatProducts','Income','MntGoldProds','NumStorePurchases')
# Add the 'target' column to the selected columns
columns_to_plot <- c(columns_to_plot, 'kmeans_cluster')
# Subset the data based on the selected columns
data_subset <- plot_data[, columns_to_plot]
data_subset$kmeans_cluster <- as.factor(data_subset$kmeans_cluster)
# Draw pair plots
ggpairs(data_subset, columns = 1:length(columns_to_plot), aes(color = kmeans_cluster)) +
theme(axis.text = element_text(size = 5),
strip.text.x = element_text(size = 7),
strip.text.y = element_text(size = 4)
) # Adjust the font size
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The customer in Cluster 2 tends to
spend more
shop more frequently in stores
have higher Income
plot_data$kmeans_cluster <- factor(plot_data$kmeans_cluster)
ggplot(plot_data, aes(x = as.character(AcceptedPrv), fill = kmeans_cluster)) +
geom_bar(position = 'stack') +
labs(x = 'Previously Accepted Campaigns',
fill = 'Cluster',
title = 'Distribution of Previously Accepted Campaigns Across Clusters')
plot_data$kmeans_cluster <- factor(plot_data$kmeans_cluster)
ggplot(plot_data, aes(x = as.character(Response), fill = kmeans_cluster)) +
geom_bar(position = 'stack') +
labs(x = 'if customer accepted the offer in the last campaign',
fill = 'Cluster',
title = 'Customer Response Distribution by Previous Campaign Acceptance')
#library(ggplot2)
#library(ggpubr)
# Assuming you have a dataframe named 'data' containing your data, and 'target' is the column containing the class labels
# Install and load the necessary package
my_colors <- c("pink", "#339CFF")
# Select the columns for pair plotting
columns_to_plot <- c('MntSpent','MntWines','MntMeatProducts','Income','MntGoldProds','NumStorePurchases')
# Add the 'target' column to the selected columns
columns_to_plot <- c(columns_to_plot, 'Campaigns_Accepted')
# Subset the data based on the selected columns
data_subset <- ohe_data[, columns_to_plot]
data_subset$Campaigns_Accepted <- as.factor(data_subset$Campaigns_Accepted)
# Draw pair plots
ggpairs(data_subset, columns = 1:length(columns_to_plot), aes(color = Campaigns_Accepted)) +
theme(axis.text = element_text(size = 6), # Adjust the font size
strip.text.x = element_text(size = 7),
strip.text.y = element_text(size = 4)
) +
scale_color_manual(values = my_colors) +
scale_fill_manual(values = my_colors)# Set custom color palette
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
CampaignsAcceptedand the variables with high importance
score# Create a continuous variable
data <- marketing
data$Campaigns_Accepted <- ohe_data$Campaigns_Accepted
# Define the cut points and labels for the categorical variable
cut_points <- c(0,800,3000)
labels <- c("Low", "High")
# Transform the continuous variable into a categorical variable
data$MntSpent <- cut(data$MntSpent, breaks = cut_points, labels = labels)
chisq <- chisq.test(table(data$MntSpent, data$Campaigns_Accepted))
chisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data$MntSpent, data$Campaigns_Accepted)
## X-squared = 190.41, df = 1, p-value < 2.2e-16
effect_size <- assocstats(table(data$MntSpent, data$Campaigns_Accepted))
effect_size
## X^2 df P(> X^2)
## Likelihood Ratio 186.44 1 0
## Pearson 191.86 1 0
##
## Phi-Coefficient : 0.309
## Contingency Coeff.: 0.295
## Cramer's V : 0.309
round(chisq$residuals, 3)
##
## 0 1
## Low 4.318 -6.975
## High -5.875 9.489
corrplot(chisq$residuals, is.cor = FALSE)
# Create a continuous variable
data <- marketing
data$Campaigns_Accepted <- ohe_data$Campaigns_Accepted
# Define the cut points and labels for the categorical variable
cut_points <- c(0,500,3000)
labels <- c("Low", "High")
# Transform the continuous variable into a categorical variable
data$MntWines <- cut(data$MntWines, breaks = cut_points, labels = labels)
chisq <- chisq.test(table(data$MntWines, data$Campaigns_Accepted))
chisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data$MntWines, data$Campaigns_Accepted)
## X-squared = 251.54, df = 1, p-value < 2.2e-16
effect_size <- assocstats(table(data$MntWines, data$Campaigns_Accepted))
effect_size
## X^2 df P(> X^2)
## Likelihood Ratio 237.57 1 0
## Pearson 253.34 1 0
##
## Phi-Coefficient : 0.356
## Contingency Coeff.: 0.335
## Cramer's V : 0.356
round(chisq$residuals, 3)
##
## 0 1
## Low 4.318 -6.961
## High -7.194 11.597
corrplot(chisq$residuals, is.cor = FALSE)
# Define the cut points and labels for the categorical variable
cut_points <- c(0,300,3000)
labels <- c("Low", "High")
# Transform the continuous variable into a categorical variable
data$MntMeatProducts <- cut(data$MntMeatProducts, breaks = cut_points, labels = labels)
chisq <- chisq.test(table(data$MntMeatProducts, data$Campaigns_Accepted))
chisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data$MntMeatProducts, data$Campaigns_Accepted)
## X-squared = 128.35, df = 1, p-value < 2.2e-16
effect_size <- assocstats(table(data$MntMeatProducts, data$Campaigns_Accepted))
effect_size
## X^2 df P(> X^2)
## Likelihood Ratio 119.94 1 0
## Pearson 129.76 1 0
##
## Phi-Coefficient : 0.254
## Contingency Coeff.: 0.246
## Cramer's V : 0.254
round(chisq$residuals, 3)
##
## 0 1
## Low 2.702 -4.363
## High -5.353 8.646
corrplot(chisq$residuals, is.cor = FALSE)
# Define the cut points and labels for the categorical variable
cut_points <- c(0,80,1000)
labels <- c("Low", "High")
# Transform the continuous variable into a categorical variable
data$MntGoldProds <- cut(data$MntGoldProds, breaks = cut_points, labels = labels)
chisq <- chisq.test(table(data$MntGoldProds, data$Campaigns_Accepted))
chisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data$MntGoldProds, data$Campaigns_Accepted)
## X-squared = 24.114, df = 1, p-value = 9.08e-07
effect_size <- assocstats(table(data$MntGoldProds, data$Campaigns_Accepted))
effect_size
## X^2 df P(> X^2)
## Likelihood Ratio 23.561 1 1.2101e-06
## Pearson 24.761 1 6.4913e-07
##
## Phi-Coefficient : 0.113
## Contingency Coeff.: 0.112
## Cramer's V : 0.113
round(chisq$residuals, 3)
##
## 0 1
## Low 1.121 -1.814
## High -2.364 3.825
corrplot(chisq$residuals, is.cor = FALSE)
# Define the cut points and labels for the categorical variable
cut_points <- c(0,50000,200000)
labels <- c("Low","High")
# Transform the continuous variable into a categorical variable
data$Income <- cut(data$Income, breaks = cut_points, labels = labels)
chisq <- chisq.test(table(data$Income, data$Campaigns_Accepted))
chisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data$Income, data$Campaigns_Accepted)
## X-squared = 82.409, df = 1, p-value < 2.2e-16
effect_size <- assocstats(table(data$Income, data$Campaigns_Accepted))
effect_size
## X^2 df P(> X^2)
## Likelihood Ratio 85.236 1 0
## Pearson 83.318 1 0
##
## Phi-Coefficient : 0.204
## Contingency Coeff.: 0.2
## Cramer's V : 0.204
round(chisq$residuals, 3)
##
## 0 1
## Low 3.494 -5.644
## High -3.298 5.327
corrplot(chisq$residuals, is.cor = FALSE)
# Define the cut points and labels for the categorical variable
cut_points <- c(0,5,100)
labels <- c("Low", "High")
# Transform the continuous variable into a categorical variable
data$NumStorePurchases <- cut(data$NumStorePurchases, breaks = cut_points, labels = labels)
chisq <- chisq.test(table(data$NumStorePurchases, data$Campaigns_Accepted))
chisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data$NumStorePurchases, data$Campaigns_Accepted)
## X-squared = 61.444, df = 1, p-value = 4.554e-15
effect_size <- assocstats(table(data$NumStorePurchases, data$Campaigns_Accepted))
effect_size
## X^2 df P(> X^2)
## Likelihood Ratio 62.006 1 3.4417e-15
## Pearson 62.235 1 2.9976e-15
##
## Phi-Coefficient : 0.177
## Contingency Coeff.: 0.174
## Cramer's V : 0.177
round(chisq$residuals, 3)
##
## 0 1
## Low 2.761 -4.444
## High -3.116 5.015
corrplot(chisq$residuals, is.cor = FALSE)